; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx908 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX908 %s
; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx90a -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX90A %s
; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1030 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX1030 %s


; Function Attrs: mustprogress nounwind willreturn
;
; half8: copies eight consecutive half values (element indices 0-7, i.e.
; 16 bytes) from the global-address-space pointer %0 to the
; global-address-space pointer %1. Every load and store is marked align 2.
; The assertions below expect the eight 2-byte accesses to be combined into a
; single global_load_dwordx4 and a single global_store_dwordx4 on all three
; tested targets.
;
; NOTE(review): attribute group #0 is referenced on the define line, but no
; definition of it is visible in this part of the file - confirm it is defined
; elsewhere or intentionally left empty.
define amdgpu_kernel void @half8(ptr addrspace(1) nocapture readonly %0, ptr addrspace(1) nocapture writeonly %1) local_unnamed_addr #0 {
; GFX908-LABEL: half8:
; GFX908:       ; %bb.0:
; GFX908-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
; GFX908-NEXT:    v_mov_b32_e32 v4, 0
; GFX908-NEXT:    s_waitcnt lgkmcnt(0)
; GFX908-NEXT:    global_load_dwordx4 v[0:3], v4, s[0:1]
; GFX908-NEXT:    s_waitcnt vmcnt(0)
; GFX908-NEXT:    global_store_dwordx4 v4, v[0:3], s[2:3]
; GFX908-NEXT:    s_endpgm
;
; GFX90A-LABEL: half8:
; GFX90A:       ; %bb.0:
; GFX90A-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
; GFX90A-NEXT:    v_mov_b32_e32 v4, 0
; GFX90A-NEXT:    s_waitcnt lgkmcnt(0)
; GFX90A-NEXT:    global_load_dwordx4 v[0:3], v4, s[0:1]
; GFX90A-NEXT:    s_waitcnt vmcnt(0)
; GFX90A-NEXT:    global_store_dwordx4 v4, v[0:3], s[2:3]
; GFX90A-NEXT:    s_endpgm
;
; GFX1030-LABEL: half8:
; GFX1030:       ; %bb.0:
; GFX1030-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
; GFX1030-NEXT:    v_mov_b32_e32 v4, 0
; GFX1030-NEXT:    s_waitcnt lgkmcnt(0)
; GFX1030-NEXT:    global_load_dwordx4 v[0:3], v4, s[0:1]
; GFX1030-NEXT:    s_waitcnt vmcnt(0)
; GFX1030-NEXT:    global_store_dwordx4 v4, v[0:3], s[2:3]
; GFX1030-NEXT:    s_endpgm
  ; Addresses of source elements 0-7 (half-sized steps from %0).
  %gep0 = getelementptr half, ptr addrspace(1) %0, i64 0
  %gep1 = getelementptr half, ptr addrspace(1) %0, i64 1
  %gep2 = getelementptr half, ptr addrspace(1) %0, i64 2
  %gep3 = getelementptr half, ptr addrspace(1) %0, i64 3
  %gep4 = getelementptr half, ptr addrspace(1) %0, i64 4
  %gep5 = getelementptr half, ptr addrspace(1) %0, i64 5
  %gep6 = getelementptr half, ptr addrspace(1) %0, i64 6
  %gep7 = getelementptr half, ptr addrspace(1) %0, i64 7
  ; Eight individual 2-byte loads, each with the natural alignment of 2.
  %l0 = load half, ptr addrspace(1) %gep0, align 2
  %l1 = load half, ptr addrspace(1) %gep1, align 2
  %l2 = load half, ptr addrspace(1) %gep2, align 2
  %l3 = load half, ptr addrspace(1) %gep3, align 2
  %l4 = load half, ptr addrspace(1) %gep4, align 2
  %l5 = load half, ptr addrspace(1) %gep5, align 2
  %l6 = load half, ptr addrspace(1) %gep6, align 2
  %l7 = load half, ptr addrspace(1) %gep7, align 2
  ; Addresses of destination elements 0-7 (half-sized steps from %1).
  %sgep0 = getelementptr half, ptr addrspace(1) %1, i64 0
  %sgep1 = getelementptr half, ptr addrspace(1) %1, i64 1
  %sgep2 = getelementptr half, ptr addrspace(1) %1, i64 2
  %sgep3 = getelementptr half, ptr addrspace(1) %1, i64 3
  %sgep4 = getelementptr half, ptr addrspace(1) %1, i64 4
  %sgep5 = getelementptr half, ptr addrspace(1) %1, i64 5
  %sgep6 = getelementptr half, ptr addrspace(1) %1, i64 6
  %sgep7 = getelementptr half, ptr addrspace(1) %1, i64 7
  ; Each loaded value is stored to the same element index in the destination.
  store half %l0, ptr addrspace(1) %sgep0, align 2
  store half %l1, ptr addrspace(1) %sgep1, align 2
  store half %l2, ptr addrspace(1) %sgep2, align 2
  store half %l3, ptr addrspace(1) %sgep3, align 2
  store half %l4, ptr addrspace(1) %sgep4, align 2
  store half %l5, ptr addrspace(1) %sgep5, align 2
  store half %l6, ptr addrspace(1) %sgep6, align 2
  store half %l7, ptr addrspace(1) %sgep7, align 2
  ret void
}

; Function Attrs: mustprogress nounwind willreturn
;
; half6: copies six consecutive half values (element indices 0-5, i.e.
; 12 bytes) from the global-address-space pointer %0 to the
; global-address-space pointer %1. Every load and store is marked align 1,
; which is below the 2-byte size of a half. The assertions below expect the
; six accesses to be combined into a single global_load_dwordx3 and a single
; global_store_dwordx3 on all three tested targets.
define amdgpu_kernel void @half6(ptr addrspace(1) nocapture readonly %0, ptr addrspace(1) nocapture writeonly %1) local_unnamed_addr #0 {
; GFX908-LABEL: half6:
; GFX908:       ; %bb.0:
; GFX908-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
; GFX908-NEXT:    v_mov_b32_e32 v3, 0
; GFX908-NEXT:    s_waitcnt lgkmcnt(0)
; GFX908-NEXT:    global_load_dwordx3 v[0:2], v3, s[0:1]
; GFX908-NEXT:    s_waitcnt vmcnt(0)
; GFX908-NEXT:    global_store_dwordx3 v3, v[0:2], s[2:3]
; GFX908-NEXT:    s_endpgm
;
; GFX90A-LABEL: half6:
; GFX90A:       ; %bb.0:
; GFX90A-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
; GFX90A-NEXT:    v_mov_b32_e32 v3, 0
; GFX90A-NEXT:    s_waitcnt lgkmcnt(0)
; GFX90A-NEXT:    global_load_dwordx3 v[0:2], v3, s[0:1]
; GFX90A-NEXT:    s_waitcnt vmcnt(0)
; GFX90A-NEXT:    global_store_dwordx3 v3, v[0:2], s[2:3]
; GFX90A-NEXT:    s_endpgm
;
; GFX1030-LABEL: half6:
; GFX1030:       ; %bb.0:
; GFX1030-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
; GFX1030-NEXT:    v_mov_b32_e32 v3, 0
; GFX1030-NEXT:    s_waitcnt lgkmcnt(0)
; GFX1030-NEXT:    global_load_dwordx3 v[0:2], v3, s[0:1]
; GFX1030-NEXT:    s_waitcnt vmcnt(0)
; GFX1030-NEXT:    global_store_dwordx3 v3, v[0:2], s[2:3]
; GFX1030-NEXT:    s_endpgm
  ; Addresses of source elements 0-5 (half-sized steps from %0).
  %gep0 = getelementptr half, ptr addrspace(1) %0, i64 0
  %gep1 = getelementptr half, ptr addrspace(1) %0, i64 1
  %gep2 = getelementptr half, ptr addrspace(1) %0, i64 2
  %gep3 = getelementptr half, ptr addrspace(1) %0, i64 3
  %gep4 = getelementptr half, ptr addrspace(1) %0, i64 4
  %gep5 = getelementptr half, ptr addrspace(1) %0, i64 5
  ; Six individual 2-byte loads, deliberately under-aligned (align 1).
  %l0 = load half, ptr addrspace(1) %gep0, align 1
  %l1 = load half, ptr addrspace(1) %gep1, align 1
  %l2 = load half, ptr addrspace(1) %gep2, align 1
  %l3 = load half, ptr addrspace(1) %gep3, align 1
  %l4 = load half, ptr addrspace(1) %gep4, align 1
  %l5 = load half, ptr addrspace(1) %gep5, align 1
  ; Addresses of destination elements 0-5 (half-sized steps from %1).
  %sgep0 = getelementptr half, ptr addrspace(1) %1, i64 0
  %sgep1 = getelementptr half, ptr addrspace(1) %1, i64 1
  %sgep2 = getelementptr half, ptr addrspace(1) %1, i64 2
  %sgep3 = getelementptr half, ptr addrspace(1) %1, i64 3
  %sgep4 = getelementptr half, ptr addrspace(1) %1, i64 4
  %sgep5 = getelementptr half, ptr addrspace(1) %1, i64 5
  ; Each loaded value is stored to the same element index, also with align 1.
  store half %l0, ptr addrspace(1) %sgep0, align 1
  store half %l1, ptr addrspace(1) %sgep1, align 1
  store half %l2, ptr addrspace(1) %sgep2, align 1
  store half %l3, ptr addrspace(1) %sgep3, align 1
  store half %l4, ptr addrspace(1) %sgep4, align 1
  store half %l5, ptr addrspace(1) %sgep5, align 1
  ret void
}

; Function Attrs: mustprogress nounwind willreturn
;
; half4: copies four consecutive half values (element indices 0-3, i.e.
; 8 bytes) from the global-address-space pointer %0 to the
; global-address-space pointer %1. Every load and store is marked align 4.
; Unlike the other kernels in this file, the assertions below expect the
; source data to be fetched with a scalar s_load_dwordx2 rather than a
; global_load, then copied into VGPRs and written with a single
; global_store_dwordx2. The SGPR-to-VGPR copy is two v_mov_b32 instructions on
; gfx908 and gfx1030, and one v_pk_mov_b32 on gfx90a.
;
; NOTE(review): align 4 is applied to every element, including the odd
; indices, which sit 2 bytes past an address that is also claimed to be
; 4-aligned. Presumably this over-stated alignment is intentional for the
; test - confirm before changing it.
define amdgpu_kernel void @half4(ptr addrspace(1) nocapture readonly %0, ptr addrspace(1) nocapture writeonly %1) local_unnamed_addr #0 {
; GFX908-LABEL: half4:
; GFX908:       ; %bb.0:
; GFX908-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
; GFX908-NEXT:    v_mov_b32_e32 v2, 0
; GFX908-NEXT:    s_waitcnt lgkmcnt(0)
; GFX908-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x0
; GFX908-NEXT:    s_waitcnt lgkmcnt(0)
; GFX908-NEXT:    v_mov_b32_e32 v0, s0
; GFX908-NEXT:    v_mov_b32_e32 v1, s1
; GFX908-NEXT:    global_store_dwordx2 v2, v[0:1], s[2:3]
; GFX908-NEXT:    s_endpgm
;
; GFX90A-LABEL: half4:
; GFX90A:       ; %bb.0:
; GFX90A-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
; GFX90A-NEXT:    v_mov_b32_e32 v2, 0
; GFX90A-NEXT:    s_waitcnt lgkmcnt(0)
; GFX90A-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x0
; GFX90A-NEXT:    s_waitcnt lgkmcnt(0)
; GFX90A-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
; GFX90A-NEXT:    global_store_dwordx2 v2, v[0:1], s[2:3]
; GFX90A-NEXT:    s_endpgm
;
; GFX1030-LABEL: half4:
; GFX1030:       ; %bb.0:
; GFX1030-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
; GFX1030-NEXT:    v_mov_b32_e32 v2, 0
; GFX1030-NEXT:    s_waitcnt lgkmcnt(0)
; GFX1030-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x0
; GFX1030-NEXT:    s_waitcnt lgkmcnt(0)
; GFX1030-NEXT:    v_mov_b32_e32 v0, s0
; GFX1030-NEXT:    v_mov_b32_e32 v1, s1
; GFX1030-NEXT:    global_store_dwordx2 v2, v[0:1], s[2:3]
; GFX1030-NEXT:    s_endpgm
  ; Addresses of source elements 0-3 (half-sized steps from %0).
  %gep0 = getelementptr half, ptr addrspace(1) %0, i64 0
  %gep1 = getelementptr half, ptr addrspace(1) %0, i64 1
  %gep2 = getelementptr half, ptr addrspace(1) %0, i64 2
  %gep3 = getelementptr half, ptr addrspace(1) %0, i64 3
  ; Four individual 2-byte loads, each claiming 4-byte alignment.
  %l0 = load half, ptr addrspace(1) %gep0, align 4
  %l1 = load half, ptr addrspace(1) %gep1, align 4
  %l2 = load half, ptr addrspace(1) %gep2, align 4
  %l3 = load half, ptr addrspace(1) %gep3, align 4
  ; Addresses of destination elements 0-3 (half-sized steps from %1).
  %sgep0 = getelementptr half, ptr addrspace(1) %1, i64 0
  %sgep1 = getelementptr half, ptr addrspace(1) %1, i64 1
  %sgep2 = getelementptr half, ptr addrspace(1) %1, i64 2
  %sgep3 = getelementptr half, ptr addrspace(1) %1, i64 3
  ; Each loaded value is stored to the same element index, also with align 4.
  store half %l0, ptr addrspace(1) %sgep0, align 4
  store half %l1, ptr addrspace(1) %sgep1, align 4
  store half %l2, ptr addrspace(1) %sgep2, align 4
  store half %l3, ptr addrspace(1) %sgep3, align 4
  ret void
}


; Function Attrs: mustprogress nounwind willreturn
;
; half2: copies two consecutive half values (element indices 0-1, i.e.
; 4 bytes) from the global-address-space pointer %0 to the
; global-address-space pointer %1. The loads and stores carry no explicit
; align operand, so this kernel covers the default-alignment case. The
; assertions below expect the two accesses to be combined into a single
; global_load_dword and a single global_store_dword on all three tested
; targets.
define amdgpu_kernel void @half2(ptr addrspace(1) nocapture readonly %0, ptr addrspace(1) nocapture writeonly %1) local_unnamed_addr #0 {
; GFX908-LABEL: half2:
; GFX908:       ; %bb.0:
; GFX908-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
; GFX908-NEXT:    v_mov_b32_e32 v0, 0
; GFX908-NEXT:    s_waitcnt lgkmcnt(0)
; GFX908-NEXT:    global_load_dword v1, v0, s[0:1]
; GFX908-NEXT:    s_waitcnt vmcnt(0)
; GFX908-NEXT:    global_store_dword v0, v1, s[2:3]
; GFX908-NEXT:    s_endpgm
;
; GFX90A-LABEL: half2:
; GFX90A:       ; %bb.0:
; GFX90A-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
; GFX90A-NEXT:    v_mov_b32_e32 v0, 0
; GFX90A-NEXT:    s_waitcnt lgkmcnt(0)
; GFX90A-NEXT:    global_load_dword v1, v0, s[0:1]
; GFX90A-NEXT:    s_waitcnt vmcnt(0)
; GFX90A-NEXT:    global_store_dword v0, v1, s[2:3]
; GFX90A-NEXT:    s_endpgm
;
; GFX1030-LABEL: half2:
; GFX1030:       ; %bb.0:
; GFX1030-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
; GFX1030-NEXT:    v_mov_b32_e32 v0, 0
; GFX1030-NEXT:    s_waitcnt lgkmcnt(0)
; GFX1030-NEXT:    global_load_dword v1, v0, s[0:1]
; GFX1030-NEXT:    s_waitcnt vmcnt(0)
; GFX1030-NEXT:    global_store_dword v0, v1, s[2:3]
; GFX1030-NEXT:    s_endpgm
  ; Addresses of source elements 0 and 1 (half-sized steps from %0).
  %gep0 = getelementptr half, ptr addrspace(1) %0, i64 0
  %gep1 = getelementptr half, ptr addrspace(1) %0, i64 1
  ; Two individual 2-byte loads with no explicit alignment.
  %l0 = load half, ptr addrspace(1) %gep0
  %l1 = load half, ptr addrspace(1) %gep1
  ; Addresses of destination elements 0 and 1 (half-sized steps from %1).
  %sgep0 = getelementptr half, ptr addrspace(1) %1, i64 0
  %sgep1 = getelementptr half, ptr addrspace(1) %1, i64 1
  ; Each loaded value is stored to the same element index, again without an
  ; explicit alignment.
  store half %l0, ptr addrspace(1) %sgep0
  store half %l1, ptr addrspace(1) %sgep1
  ret void
}


